package cn.doitedu;

import com.hankcs.hanlp.seg.common.Term;
import org.apache.hadoop.hive.ql.exec.UDF;
import com.hankcs.hanlp.HanLP;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import java.util.stream.Collectors;

/**
 * @Author: 深似海
 * @Site: <a href="www.51doit.com">多易教育</a>
 * @QQ: 657270652
 * @Date: 2024/4/27
 * @Desc: 学大数据，上多易教育
 *  中文分词 UDF
 **/
public class ChineseTokenizerUDF extends UDF {

    /**
     * Segments a Chinese text string into words using HanLP and joins the
     * resulting words with {@code "|"}.
     *
     * <p>Words of length 1 (single characters, which in Chinese segmentation
     * are mostly stop characters and punctuation) are filtered out.
     *
     * @param keyword the text to segment; may be {@code null} (Hive passes
     *                SQL NULL as Java {@code null})
     * @return the segmented words joined by {@code "|"}; an empty string if no
     *         word survives the length filter; {@code null} if the input is
     *         {@code null}
     */
    public String evaluate(String keyword) {

        // Hive UDF convention: NULL in -> NULL out. Without this guard,
        // HanLP.segment(null) throws a NullPointerException and fails the task.
        if (keyword == null) {
            return null;
        }

        List<Term> terms = HanLP.segment(keyword);

        // Drop single-character tokens; keep only multi-character words.
        List<String> words = terms.stream()
                .map(term -> term.word)
                .filter(word -> word.length() > 1)
                .collect(Collectors.toList());

        return String.join("|", words);
    }

}
